#### setting environment ####
require(fanc)
require(psych)
require(stm)
require(quanteda)

newspaper.names <- c("Asahi", "Chugoku", "Chunichi", "Hokkaido", "Kahoku", 
                     "Mainichi", "Nikkei", "Nishinippon", "Sankei", "Yomiuri")

#### data preparation ####
# estimation results of the 65-topic model
load("STM_result_65.Rdata")
# estimation results of the Wordfish models
load("Study2_Wordfish_result.Rdata")
# estimation results of the factor analysis models
load("FA_result.Rdata")

#### estimates by the unidimensional factor analysis model ####
## newspapers' ideal points
# posterior draws
Study2.ideal.points.draws <- t(rbind(FA.result$mcmc[[1]][, 1:10], 
                                     FA.result$mcmc[[2]][, 1:10], 
                                     FA.result$mcmc[[3]][, 1:10])) * 10
# point estimates (posterior means)
Study2.ideal.points.mean <- rowMeans(Study2.ideal.points.draws)

## factor loadings
# posterior draws
factor.loadings.draws <- t(rbind(FA.result$mcmc[[1]][, 11:75], 
                                 FA.result$mcmc[[2]][, 11:75], 
                                 FA.result$mcmc[[3]][, 11:75])) / 10
# point estimates (posterior means)
factor.loadings.mean <- rowMeans(factor.loadings.draws)

#### estimate the multidimensional factor analysis model (Figure A.7) ####
## estimate the model varing initial values
# CAUTION: it may take a long time to complete this procedure depending on the computational environment
# you may skip this procedure and load a .Rdata file containing the estimation results
multidimensional.FA.results <- list()
for (i in 1:30) {
  set.seed(100 * i)
  multidimensional.FA.results[[i]] <- fanc(topic.specific.positions, factors = 9)
  print(paste0("Estimation of model ", i, " is finished at ", date(), "."))
}
# save(multidimensional.FA.results, file = "multidimensional_FA_results.Rdata")
# load("multidimensional_FA_results.Rdata")

## select the best model by the Bayesian information criterion
select.multidimensional.FA.results <- list()
multidimensional.FA.results.BIC <- matrix(NA, 30, 9)
for (i in 1:30) {
  select.multidimensional.FA.results[[i]] <- list()
  for (j in 1:length(multidimensional.FA.results[[i]]$gamma)) {
    select.multidimensional.FA.results[[i]][[j]] <- select(multidimensional.FA.results[[i]], 
                                                           gamma = multidimensional.FA.results[[i]]$gamma[j])
    multidimensional.FA.results.BIC[i, j] <- select.multidimensional.FA.results[[i]][[j]]$BIC
  }
}
multidimensional.FA.results.BIC
which.min(multidimensional.FA.results.BIC)

#### compare the estimated factor loadings scores with the results of the unidimensional model (Figure A.3) ####
## factor loadings
multidimensional.FA.factor.loadings <- select.multidimensional.FA.results[[23]][[9]]$loadings

## proportion of variance explained
SS.loadings <- apply(multidimensional.FA.factor.loadings, 2, function(x) sum(x ^ 2)) / 
  sum(apply(multidimensional.FA.factor.loadings, 2, function(x) sum(x ^ 2)))
round(SS.loadings, 3)

SS.loadings.order <- order(SS.loadings, decreasing = TRUE)

## factor scores
multidimensional.FA.factor.scores <- factor.scores(topic.specific.positions, 
                                                   as.matrix(multidimensional.FA.factor.loadings), 
                                                   method = "Bartlett")$scores
rownames(multidimensional.FA.factor.scores) <- newspaper.names

## draw the figure
cairo_pdf("Figure_A7.pdf",
          width = 5.6, height = 3, pointsize = 7, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2))
plot(multidimensional.FA.factor.scores[, SS.loadings.order[1]], Study2.ideal.points.mean, 
     pch = 19, xlim = c(-1.5, 2.5), ylim = c(-1, 1.5), 
     main = "Factor Scores", xlab = "Scores by the Multidimensional Factor Model", 
     ylab = "Scores by the Main Analysis", xaxt = "n")
text(multidimensional.FA.factor.scores[, SS.loadings.order[1]], Study2.ideal.points.mean, 
     labels = newspaper.names, pos = c(3, 1, 1, 4, 4, 3, 3, 3, 3, 3), cex = 0.6)
axis(1, at = -1:2, labels = c("-1.0", "0.0", "1.0", "2.0"))
plot(multidimensional.FA.factor.loadings[, SS.loadings.order[1]], factor.loadings.mean, 
     pch = 21, col = NA, bg = "#00000030", 
     xlim = c(-1, 1), ylim = c(-1.5, 1.5), 
     main = "Factor Loadings", xlab = "Loadings by the Multidimensional Factor Model", 
     ylab = "Loadings by the Main Analysis")
dev.off()

## correlation between the two factor scores
round(cor(multidimensional.FA.factor.scores[, SS.loadings.order[1]], Study2.ideal.points.mean), 3)

## correlation between the two factor loadings
round(cor(multidimensional.FA.factor.loadings[, SS.loadings.order[1]], factor.loadings.mean), 3)

#### second dimension (Figure A.8 and Table A.3) ####
second.dim.score.order <- order(multidimensional.FA.factor.scores[, SS.loadings.order[2]], decreasing = TRUE)

## draw the figure
cairo_pdf("Figure_A8.pdf", 
          width = 5, height = 6, pointsize = 11, family = "Helvetica")
par(mar = c(5, 1, 1, 1))
dotchart(multidimensional.FA.factor.scores[second.dim.score.order, SS.loadings.order[2]], 
         pch = 19, xlim = c(-2, 2), xlab = "Estimated Scores")
dev.off()

## topics whose absolute value of the factor loading is in the top ten
second.dim.loading.order <- order(abs(multidimensional.FA.factor.loadings[, SS.loadings.order[2]]), decreasing = TRUE)

# topic labels and factor loadings
topics <- c("Constitutional revision", "The 2017 general election", "North Korea", "Nuclear power plants", 
            "U.S.-Iran relationship", "Official document", "Diet deliberations", "Marine Corps Air Station Futenma", 
            "Prime minister's scandals", "National security", "Ministry of Finance", "Japan-Korea relationship", 
            "Local government", "International nuclear disarmament", "Criminal trial", "Tax and pension", 
            "China's diplomatic policy", "Trade", "Labor law", "Civil Code reform", "Broadcasting", "Syria", 
            "Integrated Resort Bill", "Employment", "Freedom of speech", "Financial policy", 
            "Israeli-Palestinian conflict", "Entrance examination reform", "National finance", 
            "Corporate governance", "Antismoking bill", "Electoral systems", "Bank", "Olympic Games", 
            "Nuclear fuel", "Nursery school", "Medicine", "Sumo", "Manufacturing scandals", 
            "Agriculture, forestry, and fisheries", "Cryptocurrency", "Child welfare", "Education", "History", 
            "State budget", "Industrial relations", "Chinese politics", "Electric power", "Culture", "MEXT", 
            "Immigration", "Harassment in sports", "Enormous rain disaster", "European politics", 
            "Employment of the disabled", "Science", "Japan-Russia relationship", "Crime", "Natural disaster", 
            "Regional revitalization", "Administrative lawsuit", "Imperial Household", "Computer game", 
            "Sports", "Traffic")
topic.labels <- topics[66 - rank(abs(factor.loadings.mean))]

data.frame(topic = topic.labels[second.dim.loading.order[1:10]], 
           loading = round(multidimensional.FA.factor.loadings[second.dim.loading.order[1:10], SS.loadings.order[2]], 2))

# top five frequent and exclusive terms
frex.terms.related <- labelTopics(STM.result, n = 5)$frex[second.dim.loading.order[1:10], ]
print(frex.terms.related, quote = FALSE)

# terms frequently used by left and right newspapers
negative.terms.related <- positive.terms.related <- matrix(NA, 10, 5)
for (i in 1:10) {
  # Wordfish result of the i-th most related topic
  Wordfish.result <- Study2.wordfish.result[[second.dim.loading.order[i]]]
  # threshold of psi to evaluate whether each term is frequently used in general
  psi.threshold <- quantile(Wordfish.result$psi, 0.75)
  # ascending order of term's coefficients
  term.coef.ranking <- order(Wordfish.result$beta[Wordfish.result$psi > psi.threshold])
  # top five terms.related frequently used by left newspapers
  negative.terms.related[i, ] <- (Wordfish.result$features[Wordfish.result$psi > psi.threshold])[head(term.coef.ranking, 5)]
  # top five terms.related frequently used by right newspapers
  positive.terms.related[i, ] <- (Wordfish.result$features[Wordfish.result$psi > psi.threshold])[rev(tail(term.coef.ranking, 5))]
}
print(negative.terms.related, quote = FALSE)
print(positive.terms.related, quote = FALSE)